{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n实验目的：实现基于logistic/softmax regression的文本分类\\n\\n实验内容：\\n1）简单文本特征表示（词频、n-gram、TF-IDF）、特征组合与特征选择\\n2）模型相关：（随机）梯度下降、损失函数、学习率\\n3）实际操作：训练集/验证集/测试集的划分、shuffle\\n\\n补充：\\n1）更加复杂的句子级特征暂时没有构建\\n2）可以使用纯numpy实现，比较懒所以用现成的库了\\n\\n参考：\\nhttps://scikit-learn.org/stable/\\nhttp://sklearn.apachecn.org/#/\\nhttps://zhuanlan.zhihu.com/p/37157010\\n\\n'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''\n",
    "实验目的：实现基于logistic/softmax regression的文本分类\n",
    "\n",
    "实验内容：\n",
    "1）简单文本特征表示（词频、n-gram、TF-IDF）、特征组合与特征选择\n",
    "2）模型相关：（随机）梯度下降、损失函数、学习率\n",
    "3）实际操作：训练集/验证集/测试集的划分、shuffle\n",
    "\n",
    "补充：\n",
    "1）更加复杂的句子级特征暂时没有构建\n",
    "2）可以使用纯numpy实现，比较懒所以用现成的库了\n",
    "\n",
    "参考：\n",
    "https://scikit-learn.org/stable/\n",
    "http://sklearn.apachecn.org/#/\n",
    "https://zhuanlan.zhihu.com/p/37157010\n",
    "\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import torch\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sklearn\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from scipy.sparse import hstack\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "\n",
    "dir_all_data='data\\\\task1_all_data.tsv'\n",
    "#dir_all_data='D:\\\\workspace\\\\nlp_beginer_solution\\\\Task1\\\\data\\\\task1_all_data.tsv'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 读取数据与数据处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data_all.shape:  (156060, 4)\n",
      "data_all.keys:  Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')\n",
      "   PhraseId  SentenceId                                             Phrase  \\\n",
      "0         1           1  A series of escapades demonstrating the adage ...   \n",
      "1         2           1  A series of escapades demonstrating the adage ...   \n",
      "\n",
      "   Sentiment  \n",
      "0          1  \n",
      "1          2  \n"
     ]
    }
   ],
   "source": [
    "#读取数据 \n",
    "data_all=pd.read_csv(dir_all_data,sep='\\t')\n",
    "print(\"data_all.shape: \",data_all.shape)    #(156060, 4)\n",
    "print(\"data_all.keys: \",data_all.keys())   #['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']\n",
    "print(data_all.head(2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(156060,)\n"
     ]
    }
   ],
   "source": [
    "#取出要处理的列\n",
    "x_all=data_all['Phrase']\n",
    "y_all=data_all['Sentiment']\n",
    "print(x_all.shape)   #(156060,)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(93636,) (31212,) (31212,)\n"
     ]
    }
   ],
   "source": [
    "#划分验证集、测试集\n",
    "from sklearn.model_selection import train_test_split\n",
    "x_train, x_test,y_train,  y_test = train_test_split(x_all,y_all, test_size=0.2)\n",
    "x_train, x_val,y_train,y_val = train_test_split(x_train,y_train, test_size=0.25)\n",
    "print(x_train.shape, x_val.shape, x_test.shape)   #(93636,) (31212,) (31212,)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "####  接下来要提取几个特征：文本计数特征、word级别的TF-IDF特征、ngram级别的TF-IDF特征      \n",
    "#### Then---------->合并特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(93636, 15183) (31212, 15183)\n"
     ]
    }
   ],
   "source": [
    "#提取文本计数特征 -- 每个单词的数量\n",
    "#对文本的单词进行计数，包括文本的预处理, 分词以及过滤停用词\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "count_vect = CountVectorizer()  \n",
    "x_train_counts = count_vect.fit_transform(x_train)\n",
    "x_test_counts = count_vect.transform(x_test)\n",
    "print(x_train_counts.shape,x_test_counts.shape)  #(93636, 15188) (31212, 15188)  矩阵(句子-词汇）的维度，词表大小15188\n",
    "\n",
    "#在词汇表中一个单词的索引值对应的是该单词在整个训练的文集中出现的频率。\n",
    "#print(count_vect.vocabulary_.get(u'good'))    #5812     count_vect.vocabulary_是一个词典：word-id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(93636, 15183) (31212, 15183)\n"
     ]
    }
   ],
   "source": [
    "#提取TF-IDF特征 -- word级别的TF-IDF\n",
    "#将各文档中每个单词的出现次数除以该文档中所有单词的总数：这些新的特征称之为词频tf。\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "tfidf_transformer = TfidfVectorizer(analyzer='word',max_features=50000)\n",
    "tfidf_transformer.fit(x_train)\n",
    "x_train_tfidf_word = tfidf_transformer.transform(x_train)\n",
    "x_test_tfidf_word = tfidf_transformer.transform(x_test)\n",
    "print(x_train_tfidf_word.shape,x_test_tfidf_word.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(93636, 50000) (31212, 50000)\n"
     ]
    }
   ],
   "source": [
    "#提取TF-IDF特征 - ngram级别的TF-IDF\n",
    "#将各文档中每个单词的出现次数除以该文档中所有单词的总数：这些新的特征称之为词频tf。\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "tfidf_transformer = TfidfVectorizer(analyzer='word',ngram_range=(2,3),max_features=50000)\n",
    "tfidf_transformer.fit(x_train)\n",
    "x_train_tfidf_ngram = tfidf_transformer.transform(x_train)\n",
    "x_test_tfidf_ngram = tfidf_transformer.transform(x_test)\n",
    "print(x_train_tfidf_ngram.shape, x_test_tfidf_ngram.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(93636, 80366)\n"
     ]
    }
   ],
   "source": [
    "#合并特征（特征组合与特征选择）\n",
    "#from scipy.sparse import hstack\n",
    "train_features=x_train_counts\n",
    "test_features=x_test_counts\n",
    "\n",
    "train_features = hstack([x_train_counts,x_train_tfidf_word, x_train_tfidf_ngram])\n",
    "test_features = hstack([x_test_counts,x_test_tfidf_word ,x_test_tfidf_ngram])\n",
    "\n",
    "print(train_features.shape)   #特征的最终维度\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 模型构建与训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#训练分类器\n",
    "\n",
    "#逻辑回归\n",
    "# from sklearn.linear_model import LogisticRegression\n",
    "# clf = LogisticRegression(random_state=0, \n",
    "#                          solver='sag', #优化算法：liblinear、lbfgs、newton-cg、sag\n",
    "#                          multi_class='multinomial' #分类方式：multinomial、ovr\n",
    "# )\n",
    "\n",
    "#朴素贝叶斯\n",
    "#from sklearn.naive_bayes import MultinomialNB\n",
    "#clf = MultinomialNB().fit(train_features, y_train)\n",
    "\n",
    "\n",
    "#SGDClassifier是一系列采用了梯度下降来求解参数的算法的集合，默认是SVM\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "clf = SGDClassifier(alpha=0.001,\n",
    "                    loss='log',    #hinge代表SVM，log是逻辑回归\n",
    "                    early_stopping=True,\n",
    "                    eta0=0.001,\n",
    "                    learning_rate='adaptive', #constant、optimal、invscaling、adaptive\n",
    "                    max_iter=100 \n",
    "                   )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SGDClassifier(alpha=0.001, average=False, class_weight=None,\n",
       "              early_stopping=True, epsilon=0.1, eta0=0.001, fit_intercept=True,\n",
       "              l1_ratio=0.15, learning_rate='adaptive', loss='log', max_iter=100,\n",
       "              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,\n",
       "              random_state=None, shuffle=True, tol=0.001,\n",
       "              validation_fraction=0.1, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#打乱数据，训练\n",
    "from sklearn.utils import shuffle\n",
    "train_features,y_train=shuffle(train_features,y_train )\n",
    "\n",
    "clf.fit(train_features, y_train)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 测试过程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "#测试过程\n",
    "predict = clf.predict(test_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.5453992054338075\n"
     ]
    }
   ],
   "source": [
    "#测试集的评估\n",
    "print(np.mean(predict == y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python36",
   "language": "python",
   "name": "python36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
